1 package org.apache.lucene.search.spell;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.io.IOException;
21 import java.util.Comparator;
22 import java.util.PriorityQueue;
23 import java.util.Queue;
24
25 import org.apache.lucene.index.IndexReader;
26 import org.apache.lucene.index.Term;
27 import org.apache.lucene.search.spell.SuggestMode;
28 import org.apache.lucene.util.BytesRef;
29
30
31
32
33
34
35
36 public class WordBreakSpellChecker {
37 private int minSuggestionFrequency = 1;
38 private int minBreakWordLength = 1;
39 private int maxCombineWordLength = 20;
40 private int maxChanges = 1;
41 private int maxEvaluations = 1000;
42
43
44 public static final Term SEPARATOR_TERM = new Term("", "");
45
46
47
48
49
50
51
52
53
54 public WordBreakSpellChecker() {}
55
56
57
58
59
60
61 public enum BreakSuggestionSortMethod {
62
63
64
65
66
67
68 NUM_CHANGES_THEN_SUMMED_FREQUENCY,
69
70
71
72
73
74
75 NUM_CHANGES_THEN_MAX_FREQUENCY
76 }
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93 public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions,
94 IndexReader ir, SuggestMode suggestMode,
95 BreakSuggestionSortMethod sortMethod) throws IOException {
96 if (maxSuggestions < 1) {
97 return new SuggestWord[0][0];
98 }
99 if (suggestMode == null) {
100 suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
101 }
102 if (sortMethod == null) {
103 sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
104 }
105
106 int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
107 Comparator<SuggestWordArrayWrapper> queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY ? new LengthThenMaxFreqComparator()
108 : new LengthThenSumFreqComparator();
109 Queue<SuggestWordArrayWrapper> suggestions = new PriorityQueue<>(
110 queueInitialCapacity, queueComparator);
111
112 int origFreq = ir.docFreq(term);
113 if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) {
114 return new SuggestWord[0][];
115 }
116
117 int useMinSuggestionFrequency = minSuggestionFrequency;
118 if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
119 useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq);
120 }
121
122 generateBreakUpSuggestions(term, ir, 1, maxSuggestions,
123 useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0,
124 sortMethod);
125
126 SuggestWord[][] suggestionArray = new SuggestWord[suggestions.size()][];
127 for (int i = suggestions.size() - 1; i >= 0; i--) {
128 suggestionArray[i] = suggestions.remove().suggestWords;
129 }
130
131 return suggestionArray;
132 }
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162 public CombineSuggestion[] suggestWordCombinations(Term[] terms,
163 int maxSuggestions, IndexReader ir, SuggestMode suggestMode)
164 throws IOException {
165 if (maxSuggestions < 1) {
166 return new CombineSuggestion[0];
167 }
168
169 int[] origFreqs = null;
170 if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
171 origFreqs = new int[terms.length];
172 for (int i = 0; i < terms.length; i++) {
173 origFreqs[i] = ir.docFreq(terms[i]);
174 }
175 }
176
177 int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
178 Comparator<CombineSuggestionWrapper> queueComparator = new CombinationsThenFreqComparator();
179 Queue<CombineSuggestionWrapper> suggestions = new PriorityQueue<>(
180 queueInitialCapacity, queueComparator);
181
182 int thisTimeEvaluations = 0;
183 for (int i = 0; i < terms.length - 1; i++) {
184 if (terms[i].equals(SEPARATOR_TERM)) {
185 continue;
186 }
187 String leftTermText = terms[i].text();
188 int leftTermLength = leftTermText.codePointCount(0, leftTermText.length());
189 if (leftTermLength > maxCombineWordLength) {
190 continue;
191 }
192 int maxFreq = 0;
193 int minFreq = Integer.MAX_VALUE;
194 if (origFreqs != null) {
195 maxFreq = origFreqs[i];
196 minFreq = origFreqs[i];
197 }
198 String combinedTermText = leftTermText;
199 int combinedLength = leftTermLength;
200 for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
201 if (terms[j].equals(SEPARATOR_TERM)) {
202 break;
203 }
204 String rightTermText = terms[j].text();
205 int rightTermLength = rightTermText.codePointCount(0, rightTermText.length());
206 combinedTermText += rightTermText;
207 combinedLength +=rightTermLength;
208 if (combinedLength > maxCombineWordLength) {
209 break;
210 }
211
212 if (origFreqs != null) {
213 maxFreq = Math.max(maxFreq, origFreqs[j]);
214 minFreq = Math.min(minFreq, origFreqs[j]);
215 }
216
217 Term combinedTerm = new Term(terms[0].field(), combinedTermText);
218 int combinedTermFreq = ir.docFreq(combinedTerm);
219
220 if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
221 || combinedTermFreq >= maxFreq) {
222 if (suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX
223 || minFreq == 0) {
224 if (combinedTermFreq >= minSuggestionFrequency) {
225 int[] origIndexes = new int[j - i + 1];
226 origIndexes[0] = i;
227 for (int k = 1; k < origIndexes.length; k++) {
228 origIndexes[k] = i + k;
229 }
230 SuggestWord word = new SuggestWord();
231 word.freq = combinedTermFreq;
232 word.score = origIndexes.length - 1;
233 word.string = combinedTerm.text();
234 CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(
235 new CombineSuggestion(word, origIndexes),
236 (origIndexes.length - 1));
237 suggestions.offer(suggestion);
238 if (suggestions.size() > maxSuggestions) {
239 suggestions.poll();
240 }
241 }
242 }
243 }
244 thisTimeEvaluations++;
245 if (thisTimeEvaluations == maxEvaluations) {
246 break;
247 }
248 }
249 }
250 CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions
251 .size()];
252 for (int i = suggestions.size() - 1; i >= 0; i--) {
253 combineSuggestions[i] = suggestions.remove().combineSuggestion;
254 }
255 return combineSuggestions;
256 }
257
258 private int generateBreakUpSuggestions(Term term, IndexReader ir,
259 int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency,
260 SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
261 int totalEvaluations, BreakSuggestionSortMethod sortMethod)
262 throws IOException {
263 String termText = term.text();
264 int termLength = termText.codePointCount(0, termText.length());
265 int useMinBreakWordLength = minBreakWordLength;
266 if (useMinBreakWordLength < 1) {
267 useMinBreakWordLength = 1;
268 }
269 if (termLength < (useMinBreakWordLength * 2)) {
270 return 0;
271 }
272
273 int thisTimeEvaluations = 0;
274 for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
275 int end = termText.offsetByCodePoints(0, i);
276 String leftText = termText.substring(0, end);
277 String rightText = termText.substring(end);
278 SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
279
280 if (leftWord.freq >= useMinSuggestionFrequency) {
281 SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
282 if (rightWord.freq >= useMinSuggestionFrequency) {
283 SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
284 newSuggestion(prefix, leftWord, rightWord));
285 suggestions.offer(suggestion);
286 if (suggestions.size() > maxSuggestions) {
287 suggestions.poll();
288 }
289 }
290 int newNumberBreaks = numberBreaks + 1;
291 if (newNumberBreaks <= maxChanges) {
292 int evaluations = generateBreakUpSuggestions(new Term(term.field(),
293 rightWord.string), ir, newNumberBreaks, maxSuggestions,
294 useMinSuggestionFrequency, newPrefix(prefix, leftWord),
295 suggestions, totalEvaluations, sortMethod);
296 totalEvaluations += evaluations;
297 }
298 }
299
300 thisTimeEvaluations++;
301 totalEvaluations++;
302 if (totalEvaluations >= maxEvaluations) {
303 break;
304 }
305 }
306 return thisTimeEvaluations;
307 }
308
309 private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) {
310 SuggestWord[] newPrefix = new SuggestWord[oldPrefix.length + 1];
311 System.arraycopy(oldPrefix, 0, newPrefix, 0, oldPrefix.length);
312 newPrefix[newPrefix.length - 1] = append;
313 return newPrefix;
314 }
315
316 private SuggestWord[] newSuggestion(SuggestWord[] prefix,
317 SuggestWord append1, SuggestWord append2) {
318 SuggestWord[] newSuggestion = new SuggestWord[prefix.length + 2];
319 int score = prefix.length + 1;
320 for (int i = 0; i < prefix.length; i++) {
321 SuggestWord word = new SuggestWord();
322 word.string = prefix[i].string;
323 word.freq = prefix[i].freq;
324 word.score = score;
325 newSuggestion[i] = word;
326 }
327 append1.score = score;
328 append2.score = score;
329 newSuggestion[newSuggestion.length - 2] = append1;
330 newSuggestion[newSuggestion.length - 1] = append2;
331 return newSuggestion;
332 }
333
334 private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
335 Term term = new Term(fieldname, text);
336 int freq = ir.docFreq(term);
337 SuggestWord word = new SuggestWord();
338 word.freq = freq;
339 word.score = 1;
340 word.string = text;
341 return word;
342 }
343
344
345
346
347
348
349 public int getMinSuggestionFrequency() {
350 return minSuggestionFrequency;
351 }
352
353
354
355
356
357 public int getMaxCombineWordLength() {
358 return maxCombineWordLength;
359 }
360
361
362
363
364
365 public int getMinBreakWordLength() {
366 return minBreakWordLength;
367 }
368
369
370
371
372
373 public int getMaxChanges() {
374 return maxChanges;
375 }
376
377
378
379
380
381 public int getMaxEvaluations() {
382 return maxEvaluations;
383 }
384
385
386
387
388
389
390
391
392
393
394 public void setMinSuggestionFrequency(int minSuggestionFrequency) {
395 this.minSuggestionFrequency = minSuggestionFrequency;
396 }
397
398
399
400
401
402
403
404
405
406 public void setMaxCombineWordLength(int maxCombineWordLength) {
407 this.maxCombineWordLength = maxCombineWordLength;
408 }
409
410
411
412
413
414
415
416
417 public void setMinBreakWordLength(int minBreakWordLength) {
418 this.minBreakWordLength = minBreakWordLength;
419 }
420
421
422
423
424
425
426
427
428
429 public void setMaxChanges(int maxChanges) {
430 this.maxChanges = maxChanges;
431 }
432
433
434
435
436
437
438
439
440
441
442 public void setMaxEvaluations(int maxEvaluations) {
443 this.maxEvaluations = maxEvaluations;
444 }
445
446 private class LengthThenMaxFreqComparator implements
447 Comparator<SuggestWordArrayWrapper> {
448 @Override
449 public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
450 if (o1.suggestWords.length != o2.suggestWords.length) {
451 return o2.suggestWords.length - o1.suggestWords.length;
452 }
453 if (o1.freqMax != o2.freqMax) {
454 return o1.freqMax - o2.freqMax;
455 }
456 return 0;
457 }
458 }
459
460 private class LengthThenSumFreqComparator implements
461 Comparator<SuggestWordArrayWrapper> {
462 @Override
463 public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
464 if (o1.suggestWords.length != o2.suggestWords.length) {
465 return o2.suggestWords.length - o1.suggestWords.length;
466 }
467 if (o1.freqSum != o2.freqSum) {
468 return o1.freqSum - o2.freqSum;
469 }
470 return 0;
471 }
472 }
473
474 private class CombinationsThenFreqComparator implements
475 Comparator<CombineSuggestionWrapper> {
476 @Override
477 public int compare(CombineSuggestionWrapper o1, CombineSuggestionWrapper o2) {
478 if (o1.numCombinations != o2.numCombinations) {
479 return o2.numCombinations - o1.numCombinations;
480 }
481 if (o1.combineSuggestion.suggestion.freq != o2.combineSuggestion.suggestion.freq) {
482 return o1.combineSuggestion.suggestion.freq
483 - o2.combineSuggestion.suggestion.freq;
484 }
485 return 0;
486 }
487 }
488
489 private class SuggestWordArrayWrapper {
490 final SuggestWord[] suggestWords;
491 final int freqMax;
492 final int freqSum;
493
494 SuggestWordArrayWrapper(SuggestWord[] suggestWords) {
495 this.suggestWords = suggestWords;
496 int aFreqSum = 0;
497 int aFreqMax = 0;
498 for (SuggestWord sw : suggestWords) {
499 aFreqSum += sw.freq;
500 aFreqMax = Math.max(aFreqMax, sw.freq);
501 }
502 this.freqSum = aFreqSum;
503 this.freqMax = aFreqMax;
504 }
505 }
506
507 private class CombineSuggestionWrapper {
508 final CombineSuggestion combineSuggestion;
509 final int numCombinations;
510
511 CombineSuggestionWrapper(CombineSuggestion combineSuggestion,
512 int numCombinations) {
513 this.combineSuggestion = combineSuggestion;
514 this.numCombinations = numCombinations;
515 }
516 }
517 }